#include "statsxx/machine_learning/Ensemble.hpp"


// STL
#include <algorithm>           // std::max(), ::min()
#include <cmath>               // std::log(), ::exp()
#include <random>              // std::mt19937, ::discrete_distribution<>
#include <vector>              // std::vector<>

// jScience
#include "jScience/linalg.hpp" // Matrix<>, Vector<>, normalize()
#include "jrandnum.hpp"        // prng_Mersenne_twister()


//
// DESC: Train learners (serially) by the Real AdaBoost method.
//
// -----
//
// NOTE: The following algorithm is *based* on the original Real version of the AdaBoost method.
//
// NOTE: ... See:
//
//     J. Friedman, T. Hastie, and R. Tibshirani, "Additive Logistic Regression: a Statistical View of Boosting" (1998)
//
// -----
//
// TODO: NOTE: Recall that Boosting in Ensemble is only implemented in a two-class classification setting.
//
template<typename T>
inline void Ensemble<T>::train_Real_AdaBoost(
                                             const Matrix<double> &X_in,
                                             const Matrix<double> &X_out
                                             )
{
    //=========================================================
    // INITIALIZE
    //=========================================================

    // NOTE: The class values used in Real AdaBoost are in {-1, 1}.
    //
    // NOTE: ... We therefore need to convert {0, 1} in X_out to this range.

    std::vector<double> y(X_out.size(0));

    for( auto i = 0; i < X_out.size(0); ++i )
    {
        y[i] = ( 2*X_out(i,0) - 1. );
    }

    // NOTE: The class-probability estimates must be clamped away from {0, 1} before
    // taking the half-log-odds below; otherwise f_m becomes +/-inf, which in turn
    // drives the sample weights to 0/inf (and NaN after renormalization).
    //
    // NOTE: ... This is the standard guard used by Real AdaBoost / SAMME.R
    // implementations (cf. scikit-learn, which clips probabilities near machine epsilon).

    constexpr double p_eps = 1.e-15;

    // NOTE: Construct the PRNG once, outside the boosting loop: re-seeding a fresh
    // Mersenne twister every round is wasteful, and risks correlated (or identical)
    // bootstrap samples across rounds if the seed source has coarse granularity.

    std::mt19937 prng = prng_Mersenne_twister();

    // NOTE: The following (numbering, etc.) follows Algorithm 2 in Friedman, et al.

    //=========================================================
    // STEP 1. Start with weights w_i = 1/N, i = 0, 1, ..., (X_in.size(0)-1).
    //=========================================================

    std::vector<double> w_X(X_in.size(0), (1./X_in.size(0)));

    //=========================================================
    // STEP 2. Repeat for i = 0, 1, ..., (this->learners.size()-1):
    //=========================================================

    for( auto i = 0; i < this->learners.size(); ++i )
    {
        //---------------------------------------------------------
        // STEP 2. (a) Fit classifier (i) to obtain class probability estimates, using weights w_i on the training data.
        //---------------------------------------------------------

        // (WEIGHTED) BOOTSTRAP DATA

        // NOTE: Because each Learner (of the Ensemble) only has access to its .f_train() function, the training data cannot literally be weighted.
        //
        // NOTE: ... We can achieve effectively the same result (maybe better?) by performing a weighted bootstrap.

        // TODO: NOTE: Perhaps the following distribution-type functionality is to go with the jrandnum subroutines.

        // NOTE: The discrete distribution must be rebuilt each round, since it samples
        // row indices according to the *current* weights w_X.

        std::discrete_distribution<int> distribution(w_X.begin(), w_X.end());

        Matrix<double> X_in_i(X_in.size(0),X_in.size(1));
        Matrix<double> X_out_i(X_out.size(0),X_out.size(1));

        for( auto j = 0; j < X_in.size(0); ++j )
        {
            // r = weighted-bootstrap row index into the original training data
            int r = distribution(prng);

            for( auto k = 0; k < X_in.size(1); ++k )
            {
                X_in_i(j,k) = X_in(r,k);
            }

            for( auto k = 0; k < X_out.size(1); ++k )
            {
                X_out_i(j,k) = X_out(r,k);
            }
        }

        // TRAIN
        this->learners[i].f_train(
                                  X_in_i,
                                  X_out_i
                                  );

        //---------------------------------------------------------
        // STEP 2. (b) Set f_m(x).
        //---------------------------------------------------------

        std::vector<double> f_m(X_in.size(0));

        for( auto j = 0; j < X_in.size(0); ++j )
        {
            std::vector<double> output = this->learners[i].f_evaluate(
                                                                      X_in.row(j).std_vector()
                                                                      );

            // NOTE: The factor of (1/2) (in the following) arises from ensuring that f_m is the minimizer of:
            //
            //     E[ exp(-y*f_m) ]
            //
            // NOTE: See Lemma 1 of Friedman, et al.

            // clamp p into [p_eps, 1-p_eps] so the log-odds stay finite
            double p = std::min(std::max(output[0], p_eps), (1. - p_eps));

            f_m[j] = 0.5*std::log( (p/(1. - p)) );
        }

        //---------------------------------------------------------
        // STEP 2. (c) Set w_i <-- w_i*exp(-y_i*f_m(x_i)), ...
        //---------------------------------------------------------

        for( auto j = 0; j < X_in.size(0); ++j )
        {
            w_X[j] *= std::exp(-y[j]*f_m[j]);
        }

        // ... and renormalize
        w_X = normalize(Vector<double>(w_X)).std_vector();
    }

    //=========================================================
    // CLEANUP AND RETURN
    //=========================================================

    // SET EVALUATION FLAG

    // NOTE: Signals that this Ensemble should be evaluated in Boost mode.
    _evaluate_Boost = true;
}
